{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "XML or text declaration not at start of entity: line 1, column 9\n",
      "mismatched tag: line 1, column 21\n",
      "written: 1000\n",
      "written: 2000\n",
      "written: 3000\n",
      "written: 4000\n",
      "written: 5000\n",
      "written: 6000\n",
      "written: 7000\n",
      "written: 8000\n",
      "written: 9000\n",
      "written: 10000\n",
      "written: 11000\n",
      "written: 12000\n",
      "written: 13000\n",
      "written: 14000\n",
      "written: 15000\n",
      "written: 16000\n",
      "written: 17000\n",
      "written: 18000\n",
      "written: 19000\n",
      "written: 20000\n",
      "written: 21000\n",
      "written: 22000\n",
      "written: 23000\n",
      "written: 24000\n",
      "written: 25000\n",
      "written: 26000\n",
      "written: 27000\n",
      "written: 28000\n",
      "written: 29000\n",
      "written: 30000\n",
      "written: 31000\n",
      "written: 32000\n",
      "written: 33000\n",
      "written: 34000\n",
      "written: 35000\n",
      "written: 36000\n",
      "written: 37000\n",
      "written: 38000\n",
      "written: 39000\n",
      "written: 40000\n",
      "written: 41000\n",
      "written: 42000\n",
      "written: 43000\n",
      "written: 44000\n",
      "written: 45000\n",
      "written: 46000\n",
      "written: 47000\n",
      "written: 48000\n",
      "written: 49000\n",
      "written: 50000\n",
      "written: 51000\n",
      "written: 52000\n",
      "written: 53000\n",
      "written: 54000\n",
      "written: 55000\n",
      "written: 56000\n",
      "written: 57000\n",
      "written: 58000\n",
      "written: 59000\n",
      "written: 60000\n",
      "written: 61000\n",
      "written: 62000\n",
      "written: 63000\n",
      "written: 64000\n",
      "written: 65000\n",
      "written: 66000\n",
      "written: 67000\n",
      "written: 68000\n",
      "written: 69000\n",
      "written: 70000\n",
      "written: 71000\n",
      "written: 72000\n",
      "written: 73000\n",
      "written: 74000\n",
      "written: 75000\n",
      "written: 76000\n",
      "written: 77000\n",
      "written: 78000\n",
      "written: 79000\n",
      "written: 80000\n",
      "written: 81000\n",
      "written: 82000\n",
      "written: 83000\n",
      "written: 84000\n",
      "written: 85000\n",
      "written: 86000\n",
      "written: 87000\n",
      "written: 88000\n",
      "written: 89000\n",
      "written: 90000\n",
      "written: 91000\n",
      "written: 92000\n",
      "written: 93000\n",
      "written: 94000\n",
      "written: 95000\n",
      "written: 96000\n",
      "written: 97000\n",
      "written: 98000\n",
      "written: 99000\n",
      "written: 100000\n",
      "written: 101000\n",
      "written: 102000\n",
      "written: 103000\n",
      "written: 104000\n",
      "written: 105000\n",
      "written: 106000\n",
      "written: 107000\n",
      "written: 108000\n",
      "written: 109000\n",
      "written: 110000\n",
      "written: 111000\n",
      "written: 112000\n",
      "written: 113000\n",
      "written: 114000\n",
      "written: 115000\n",
      "written: 116000\n",
      "written: 117000\n",
      "written: 118000\n",
      "written: 119000\n",
      "written: 120000\n",
      "written: 121000\n",
      "written: 122000\n",
      "written: 123000\n",
      "written: 124000\n",
      "written: 125000\n",
      "written: 126000\n",
      "written: 127000\n",
      "written: 128000\n",
      "written: 129000\n",
      "written: 130000\n",
      "written: 131000\n",
      "written: 132000\n",
      "written: 133000\n",
      "written: 134000\n",
      "written: 135000\n",
      "written: 136000\n",
      "written: 137000\n",
      "written: 138000\n",
      "written: 139000\n",
      "written: 140000\n",
      "written: 141000\n",
      "written: 142000\n",
      "written: 143000\n",
      "written: 144000\n",
      "written: 145000\n",
      "written: 146000\n",
      "written: 147000\n",
      "written: 148000\n",
      "written: 149000\n",
      "written: 150000\n",
      "written: 151000\n",
      "written: 152000\n",
      "written: 153000\n",
      "written: 154000\n",
      "written: 155000\n",
      "written: 156000\n",
      "written: 157000\n",
      "written: 158000\n",
      "written: 159000\n",
      "written: 160000\n",
      "written: 161000\n",
      "written: 162000\n",
      "written: 163000\n",
      "written: 164000\n",
      "written: 165000\n",
      "written: 166000\n",
      "written: 167000\n",
      "written: 168000\n",
      "written: 169000\n",
      "written: 170000\n",
      "written: 171000\n",
      "written: 172000\n",
      "written: 173000\n",
      "written: 174000\n",
      "written: 175000\n",
      "written: 176000\n",
      "written: 177000\n",
      "written: 178000\n",
      "written: 179000\n",
      "written: 180000\n",
      "written: 181000\n",
      "written: 182000\n",
      "written: 183000\n",
      "written: 184000\n",
      "written: 185000\n",
      "written: 186000\n",
      "written: 187000\n",
      "written: 188000\n",
      "written: 189000\n",
      "written: 190000\n",
      "written: 191000\n",
      "written: 192000\n",
      "written: 193000\n",
      "written: 194000\n",
      "written: 195000\n",
      "written: 196000\n",
      "written: 197000\n",
      "written: 198000\n",
      "written: 199000\n",
      "written: 200000\n",
      "written: 201000\n",
      "written: 202000\n",
      "written: 203000\n",
      "written: 204000\n",
      "written: 205000\n",
      "written: 206000\n",
      "written: 207000\n",
      "written: 208000\n",
      "written: 209000\n",
      "written: 210000\n",
      "written: 211000\n",
      "written: 212000\n",
      "written: 213000\n",
      "written: 214000\n",
      "written: 215000\n",
      "written: 216000\n",
      "written: 217000\n",
      "written: 218000\n",
      "written: 219000\n",
      "written: 220000\n",
      "written: 221000\n",
      "written: 222000\n",
      "written: 223000\n",
      "written: 224000\n",
      "written: 225000\n",
      "written: 226000\n",
      "written: 227000\n",
      "written: 228000\n",
      "written: 229000\n",
      "written: 230000\n",
      "written: 231000\n",
      "written: 232000\n",
      "written: 233000\n",
      "written: 234000\n",
      "written: 235000\n",
      "written: 236000\n",
      "written: 237000\n",
      "written: 238000\n",
      "written: 239000\n",
      "written: 240000\n",
      "written: 241000\n",
      "written: 242000\n",
      "written: 243000\n",
      "written: 244000\n",
      "written: 245000\n",
      "written: 246000\n",
      "written: 247000\n",
      "written: 248000\n",
      "written: 249000\n",
      "written: 250000\n",
      "written: 251000\n",
      "written: 252000\n",
      "written: 253000\n",
      "written: 254000\n",
      "written: 255000\n",
      "written: 256000\n",
      "written: 257000\n",
      "written: 258000\n",
      "written: 259000\n",
      "written: 260000\n",
      "written: 261000\n",
      "written: 262000\n",
      "written: 263000\n",
      "written: 264000\n",
      "written: 265000\n",
      "written: 266000\n",
      "written: 267000\n",
      "written: 268000\n",
      "written: 269000\n",
      "written: 270000\n",
      "written: 271000\n",
      "written: 272000\n",
      "written: 273000\n",
      "written: 274000\n",
      "written: 275000\n",
      "written: 276000\n",
      "written: 277000\n",
      "written: 278000\n",
      "written: 279000\n",
      "written: 280000\n",
      "written: 281000\n",
      "written: 282000\n",
      "written: 283000\n",
      "written: 284000\n",
      "written: 285000\n",
      "written: 286000\n",
      "written: 287000\n",
      "written: 288000\n",
      "written: 289000\n",
      "written: 290000\n",
      "written: 291000\n",
      "written: 292000\n",
      "written: 293000\n",
      "written: 294000\n",
      "written: 295000\n",
      "written: 296000\n",
      "written: 297000\n",
      "written: 298000\n",
      "written: 299000\n",
      "written: 300000\n",
      "written: 301000\n",
      "written: 302000\n",
      "written: 303000\n",
      "written: 304000\n",
      "written: 305000\n",
      "written: 306000\n",
      "written: 307000\n",
      "written: 308000\n",
      "written: 309000\n",
      "written: 310000\n",
      "written: 311000\n",
      "written: 312000\n",
      "written: 313000\n",
      "written: 314000\n",
      "written: 315000\n",
      "written: 316000\n",
      "written: 317000\n",
      "written: 318000\n",
      "written: 319000\n",
      "written: 320000\n",
      "written: 321000\n",
      "written: 322000\n",
      "written: 323000\n",
      "written: 324000\n",
      "written: 325000\n",
      "written: 326000\n",
      "written: 327000\n",
      "written: 328000\n",
      "written: 329000\n",
      "written: 330000\n",
      "written: 331000\n",
      "written: 332000\n",
      "written: 333000\n",
      "written: 334000\n",
      "written: 335000\n",
      "written: 336000\n",
      "written: 337000\n",
      "written: 338000\n",
      "written: 339000\n",
      "written: 340000\n",
      "written: 341000\n",
      "written: 342000\n",
      "written: 343000\n",
      "written: 344000\n",
      "written: 345000\n",
      "written: 346000\n",
      "written: 347000\n",
      "written: 348000\n",
      "written: 349000\n",
      "written: 350000\n",
      "written: 351000\n",
      "written: 352000\n",
      "written: 353000\n",
      "written: 354000\n",
      "written: 355000\n",
      "written: 356000\n",
      "written: 357000\n",
      "written: 358000\n",
      "written: 359000\n",
      "written: 360000\n",
      "written: 361000\n",
      "written: 362000\n",
      "written: 363000\n",
      "written: 364000\n",
      "written: 365000\n",
      "written: 366000\n",
      "written: 367000\n",
      "written: 368000\n",
      "written: 369000\n",
      "written: 370000\n",
      "written: 371000\n",
      "written: 372000\n",
      "written: 373000\n",
      "written: 374000\n",
      "written: 375000\n",
      "written: 376000\n",
      "written: 377000\n",
      "written: 378000\n",
      "written: 379000\n",
      "written: 380000\n",
      "written: 381000\n",
      "written: 382000\n",
      "written: 383000\n",
      "written: 384000\n",
      "written: 385000\n",
      "written: 386000\n",
      "written: 387000\n",
      "written: 388000\n",
      "written: 389000\n",
      "written: 390000\n",
      "written: 391000\n",
      "written: 392000\n",
      "written: 393000\n",
      "written: 394000\n",
      "written: 395000\n",
      "written: 396000\n",
      "written: 397000\n",
      "written: 398000\n",
      "written: 399000\n",
      "written: 400000\n",
      "written: 401000\n",
      "written: 402000\n",
      "written: 403000\n",
      "written: 404000\n",
      "written: 405000\n",
      "written: 406000\n",
      "written: 407000\n",
      "written: 408000\n",
      "written: 409000\n",
      "written: 410000\n",
      "written: 411000\n",
      "written: 412000\n",
      "written: 413000\n",
      "written: 414000\n",
      "written: 415000\n",
      "written: 416000\n",
      "written: 417000\n",
      "written: 418000\n",
      "written: 419000\n",
      "written: 420000\n",
      "written: 421000\n",
      "written: 422000\n",
      "written: 423000\n",
      "written: 424000\n",
      "written: 425000\n",
      "written: 426000\n",
      "written: 427000\n",
      "written: 428000\n",
      "written: 429000\n",
      "written: 430000\n",
      "written: 431000\n",
      "written: 432000\n",
      "written: 433000\n",
      "written: 434000\n",
      "written: 435000\n",
      "written: 436000\n",
      "written: 437000\n",
      "written: 438000\n",
      "written: 439000\n",
      "written: 440000\n",
      "written: 441000\n",
      "written: 442000\n",
      "written: 443000\n",
      "written: 444000\n",
      "written: 445000\n",
      "written: 446000\n",
      "written: 447000\n",
      "written: 448000\n",
      "written: 449000\n",
      "written: 450000\n",
      "written: 451000\n",
      "written: 452000\n",
      "written: 453000\n",
      "written: 454000\n",
      "written: 455000\n",
      "written: 456000\n",
      "written: 457000\n",
      "written: 458000\n",
      "written: 459000\n",
      "written: 460000\n",
      "written: 461000\n",
      "written: 462000\n",
      "written: 463000\n",
      "written: 464000\n",
      "written: 465000\n",
      "written: 466000\n",
      "written: 467000\n",
      "written: 468000\n",
      "written: 469000\n",
      "written: 470000\n",
      "written: 471000\n",
      "written: 472000\n",
      "written: 473000\n",
      "written: 474000\n",
      "written: 475000\n",
      "written: 476000\n",
      "written: 477000\n",
      "written: 478000\n",
      "written: 479000\n",
      "written: 480000\n",
      "written: 481000\n",
      "written: 482000\n",
      "written: 483000\n",
      "written: 484000\n",
      "written: 485000\n",
      "written: 486000\n",
      "written: 487000\n",
      "written: 488000\n",
      "written: 489000\n",
      "written: 490000\n",
      "written: 491000\n",
      "written: 492000\n",
      "written: 493000\n",
      "written: 494000\n",
      "written: 495000\n",
      "written: 496000\n",
      "mismatched tag: line 1, column 11\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source</th>\n",
       "      <th>url</th>\n",
       "      <th>title</th>\n",
       "      <th>image</th>\n",
       "      <th>category</th>\n",
       "      <th>description</th>\n",
       "      <th>rank</th>\n",
       "      <th>pubdate</th>\n",
       "      <th>video</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [source, url, title, image, category, description, rank, pubdate, video]\n",
       "Index: []"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import xml.etree.ElementTree as ElementTree\n",
    "import pandas as pd\n",
    "import csv\n",
    "\n",
    "dfcols = ['source', \n",
    "          'url', \n",
    "          'title', \n",
    "          'image',\n",
    "          'category',\n",
    "          'description',\n",
    "          'rank',\n",
    "          'pubdate',\n",
    "          'video'\n",
    "         ]\n",
    "\n",
    "def getvalueofnode(node):\n",
    "    \"\"\" return node text or None \"\"\"\n",
    "    return node.text if node is not None else None\n",
    "\n",
    "count = 0\n",
    "with open(\"newsspace200.xml\") as file:\n",
    "# with open(\"newsSpace.sample\") as file:\n",
    "    with open(\"newsSpace.parsed\", \"w\") as dest:\n",
    "        wr = csv.writer(dest)\n",
    "        for line in file:\n",
    "            try:\n",
    "                line = \"<example>\"+line[:-1]+\"</example>\"\n",
    "                root = ElementTree.fromstring(line)\n",
    "                \n",
    "                source = root.find('source')\n",
    "                url = root.find('url')\n",
    "                title = root.find('title')\n",
    "                image = root.find('image')\n",
    "                category = root.find('category')\n",
    "                description = root.find('description')\n",
    "                rank = root.find('rank')\n",
    "                pubdate = root.find('pubdate')\n",
    "                video = root.find('video')\n",
    "                \n",
    "                csv_line = [getvalueofnode(source),\n",
    "                                getvalueofnode(url),\n",
    "                                getvalueofnode(title),\n",
    "                                getvalueofnode(image),\n",
    "                                getvalueofnode(category),\n",
    "                                getvalueofnode(description),\n",
    "                                getvalueofnode(rank),\n",
    "                                getvalueofnode(pubdate),\n",
    "                                getvalueofnode(video)\n",
    "                              ]\n",
    "                wr.writerow(csv_line)\n",
    "                count = count + 1\n",
    "                if count % 50000 == 0:\n",
    "                    print(\"written:\", count)\n",
    "            except Exception as e: \n",
    "                print(e)\n",
    "                continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/lapoloni/env/virtual_env/power_overwhelming/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2728: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source</th>\n",
       "      <th>url</th>\n",
       "      <th>title</th>\n",
       "      <th>image</th>\n",
       "      <th>category</th>\n",
       "      <th>description</th>\n",
       "      <th>rank</th>\n",
       "      <th>pubdate</th>\n",
       "      <th>video</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Wall St. Pullback Reflects Tech Blowout (Reuters)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Wall Street's long-playing drama,\\\"W...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Wall St. Bears Claw Back Into the Black (Reuters)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Short-sellers, Wall Street's dwindli...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Carlyle Looks Toward Commercial Aerospace (Reu...</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Private investment firm Carlyle Grou...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Oil and Economy Cloud Stocks' Outlook (Reuters)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Soaring crude prices plus worries\\ab...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Iraq Halts Oil Exports from Main Southern Pipe...</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Authorities have halted oil export\\f...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Oil prices soar to all-time record, posing new...</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>AFP - Tearaway world oil prices, toppling reco...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Stocks End Up, But Near Year Lows (Reuters)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Stocks ended slightly higher on Frid...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Money Funds Fell in Latest Week (AP)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>AP - Assets of the nation's retail money marke...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Fed minutes show dissent over inflation (USATO...</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>USATODAY.com - Retail sales bounced back a bit...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Safety Net (Forbes.com)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Forbes.com - After earning a PH.D. in Sociolog...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Uprising Keeps Iraq Oil Exports Halved</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>BAGHDAD (Reuters) - Iraq's oil exports were s...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Wall St. Pullback Reflects Tech Blowout</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>NEW YORK (Reuters) - Wall Street's long-playi...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:24</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Wall St. Bears Claw Back Into the Black</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>NEW YORK (Reuters) - Short-sellers, Wall Stre...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Carlyle Looks Toward Commercial Aerospace</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>NEW YORK (Reuters) - Private investment firm ...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:27</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Oil and Economy Cloud Stocks' Outlook</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>NEW YORK (Reuters) - Soaring crude prices plu...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>GlaxoSmithKline: CFO to Retire in March</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>LONDON (Reuters) - GlaxoSmithKline &lt;GSK.L&gt; Ch...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:29</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>No Need for OPEC to Pump More-Iran Gov</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>TEHRAN (Reuters) - OPEC can do nothing to dou...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:32</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Non-OPEC Nations Should Up Output-Purnomo</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>JAKARTA (Reuters) - Non-OPEC oil exporters sh...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:34</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Google IPO Auction Off to Rocky Start</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>WASHINGTON/NEW YORK (Reuters) - The auction f...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Dollar Falls Broadly on Record Trade Gap</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>NEW YORK (Reuters) - The dollar tumbled broad...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:37</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>Washington Post Business</td>\n",
       "      <td>http://www.washingtonpost.com/wp-dyn/articles/...</td>\n",
       "      <td>Rescuing an Old Saver</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>If you think you may need to help your elderly...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:40</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>Washington Post Business</td>\n",
       "      <td>http://www.washingtonpost.com/wp-dyn/articles/...</td>\n",
       "      <td>Kids Rule for Back-to-School</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>The purchasing power of kids is a big part of ...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:43</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>Washington Post Business</td>\n",
       "      <td>http://www.washingtonpost.com/wp-dyn/articles/...</td>\n",
       "      <td>In a Down Market, Head Toward Value Funds</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>There is little cause for celebration in the s...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:46</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>US trade deficit swells in June</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>The US trade deficit has exploded 19 to a reco...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Crude oil prices jump to new high</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>The price of crude oil reaches new record leve...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Shell 'could be target for Total'</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Oil giant Shell could be bracing itself for a ...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Google IPO faces Playboy slip-up</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>The bidding gets underway for Google's public ...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Eurozone economy keeps growing</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Official figures show the 12-nation eurozone e...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Swatch dismisses tax allegations</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Swiss watchmaker Swatch, official timekeeper t...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Expansion slows in Japan</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Economic growth in Japan slows down as the cou...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496805</th>\n",
       "      <td>Reuters World</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Israel Kills Two Gaza Gunmen - Medics</td>\n",
       "      <td>http___ad.doubleclick.net_ad_reuters.com.dart_...</td>\n",
       "      <td>World</td>\n",
       "      <td>GAZA (Reuters) - Israeli soldiers shot dead t...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496806</th>\n",
       "      <td>BBC - Front Page</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Iraqi judges quiz 'Chemical Ali'</td>\n",
       "      <td>http___newsimg.bbc.co.uk_media_images_39639000...</td>\n",
       "      <td>Top News</td>\n",
       "      <td>Ali Hassan al-Majid - widely known as Chemical...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496807</th>\n",
       "      <td>Reuters World</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Pakistan's Musharraf Says Won't Quit as Army C...</td>\n",
       "      <td>http___ad.doubleclick.net_ad_reuters.com.dart_...</td>\n",
       "      <td>World</td>\n",
       "      <td>KARACHI (Reuters) - Pakistani President Perve...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-12-18 15:59:39</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496808</th>\n",
       "      <td>Reuters World</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Philippines Begins to Rebuild Flood-Hit Areas</td>\n",
       "      <td>http___wwwi.reuters.com_images_w148_amdf798539...</td>\n",
       "      <td>World</td>\n",
       "      <td>MANILA (Reuters) - Hundreds of army and civil...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496809</th>\n",
       "      <td>Reuters World</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Scant Progress on Post-Kyoto as Climate Talks End</td>\n",
       "      <td>http___ad.doubleclick.net_ad_reuters.com.dart_...</td>\n",
       "      <td>World</td>\n",
       "      <td>BUENOS AIRES, Argentina (Reuters) - U.N. talk...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496810</th>\n",
       "      <td>New York Times sports</td>\n",
       "      <td>http://www.boston.com/sports/baseball/redsox/a...</td>\n",
       "      <td>Answer coming at first</td>\n",
       "      <td>http___cache.boston.com_bonzai-fba_File-Based_...</td>\n",
       "      <td>Sports</td>\n",
       "      <td>Red Sox general manager Theo Epstein yesterday...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-12-18 16:19:49</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496811</th>\n",
       "      <td>New York Times sports</td>\n",
       "      <td>http://www.boston.com/sports/baseball/redsox/a...</td>\n",
       "      <td>Renteria signing a top-shelf deal</td>\n",
       "      <td>http___cache.boston.com_bonzai-fba_File-Based_...</td>\n",
       "      <td>Sports</td>\n",
       "      <td>Red Sox general manager Theo Epstein acknowled...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-12-18 16:15:38</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496812</th>\n",
       "      <td>BBC - Front Page</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Potter shortlisted for FX Oscar</td>\n",
       "      <td>http___newsimg.bbc.co.uk_media_images_40194000...</td>\n",
       "      <td>Top News</td>\n",
       "      <td>The third Harry Potter film and Spider-Man 2 a...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-12-18 15:28:30</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496813</th>\n",
       "      <td>New York Times sports</td>\n",
       "      <td>http://www.boston.com/sports/baseball/articles...</td>\n",
       "      <td>Report: Johnson deal is complete</td>\n",
       "      <td>http___cache.boston.com_bonzai-fba_File-Based_...</td>\n",
       "      <td>Sports</td>\n",
       "      <td>The Big Unit is coming to the Yankees. After d...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496814</th>\n",
       "      <td>CNN Top News</td>\n",
       "      <td>http://www.newsisfree.com/iclick/i,65217888,23...</td>\n",
       "      <td>Baby found alive; woman arrested</td>\n",
       "      <td>none</td>\n",
       "      <td>Top News</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496815</th>\n",
       "      <td>NaN</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496816</th>\n",
       "      <td>CNN Top News</td>\n",
       "      <td>http://www.newsisfree.com/iclick/i,65217889,23...</td>\n",
       "      <td>Little agreed as climate talks end</td>\n",
       "      <td>none</td>\n",
       "      <td>Top News</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496817</th>\n",
       "      <td>NaN</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496818</th>\n",
       "      <td>CNN Top News</td>\n",
       "      <td>http://www.newsisfree.com/iclick/i,65217890,23...</td>\n",
       "      <td>Clouds, dry weather prevail in most of U.S.</td>\n",
       "      <td>none</td>\n",
       "      <td>Top News</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496819</th>\n",
       "      <td>NaN</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496820</th>\n",
       "      <td>New York Times sports</td>\n",
       "      <td>http://www.boston.com/sports/football/articles...</td>\n",
       "      <td>Saban not going to Dolphins yet</td>\n",
       "      <td>http___cache.boston.com_bonzai-fba_File-Based_...</td>\n",
       "      <td>Sports</td>\n",
       "      <td>The Miami Dolphins will put their courtship of...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496821</th>\n",
       "      <td>CNN Top News</td>\n",
       "      <td>http://www.newsisfree.com/iclick/i,65217891,23...</td>\n",
       "      <td>Little agreed to as climate talks end</td>\n",
       "      <td>none</td>\n",
       "      <td>Top News</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496822</th>\n",
       "      <td>NaN</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496823</th>\n",
       "      <td>CNN Top News</td>\n",
       "      <td>http://www.edapebaf.com/clickthrough.cool?db=c...</td>\n",
       "      <td>ADV: \\$150,000 Mortgage for Under \\$690/Month</td>\n",
       "      <td>none</td>\n",
       "      <td>Top News</td>\n",
       "      <td>Mortgage rates are at record lows. Save \\$1000...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-12-18 16:17:10</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496824</th>\n",
       "      <td>CNN Top News</td>\n",
       "      <td>http://www.newsisfree.com/iclick/i,65212552,23...</td>\n",
       "      <td>Ecstatic return home for Turkey PM</td>\n",
       "      <td>none</td>\n",
       "      <td>Top News</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496825</th>\n",
       "      <td>NaN</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-12-18 15:55:14</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496826</th>\n",
       "      <td>New York Times sports</td>\n",
       "      <td>http://www.boston.com/sports/football/articles...</td>\n",
       "      <td>Today's NFL games</td>\n",
       "      <td>http___cache.boston.com_bonzai-fba_File-Based_...</td>\n",
       "      <td>Sports</td>\n",
       "      <td>PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496827</th>\n",
       "      <td>New York Times sports</td>\n",
       "      <td>http://www.boston.com/sports/basketball/articl...</td>\n",
       "      <td>Shake-up in the East</td>\n",
       "      <td>http___cache.boston.com_bonzai-fba_File-Based_...</td>\n",
       "      <td>Sports</td>\n",
       "      <td>Yesterday will go down as Liberation Day in th...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-12-18 15:55:57</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496828</th>\n",
       "      <td>New York Times sports</td>\n",
       "      <td>http://www.boston.com/sports/basketball/articl...</td>\n",
       "      <td>Knicks need to go extra mile to beat Philadelphia</td>\n",
       "      <td>http___cache.boston.com_bonzai-fba_File-Based_...</td>\n",
       "      <td>Sports</td>\n",
       "      <td>Allan Houston hit the tying 3-pointer with 0.4...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496829</th>\n",
       "      <td>New York Times sports</td>\n",
       "      <td>http://www.boston.com/sports/basketball/celtic...</td>\n",
       "      <td>High on priority list: Home improvement</td>\n",
       "      <td>http___cache.boston.com_bonzai-fba_File-Based_...</td>\n",
       "      <td>Sports</td>\n",
       "      <td>Doc Rivers knows any postseason plans hinge on...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-12-18 16:14:36</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496830</th>\n",
       "      <td>BBC News world</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Compromise seals climate meeting</td>\n",
       "      <td>http___newsimg.bbc.co.uk_media_images_40643000...</td>\n",
       "      <td>World</td>\n",
       "      <td>A climate conference overcomes last-minute obj...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496831</th>\n",
       "      <td>New York Times sports</td>\n",
       "      <td>http://www.boston.com/sports/basketball/celtic...</td>\n",
       "      <td>Eisley enjoying his point of view</td>\n",
       "      <td>http___cache.boston.com_bonzai-fba_File-Based_...</td>\n",
       "      <td>Sports</td>\n",
       "      <td>Howard Eisley has fond memories of Boston. He ...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496832</th>\n",
       "      <td>BBC News world</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Iraqi judges quiz 'Chemical Ali'</td>\n",
       "      <td>http___newsimg.bbc.co.uk_media_images_39639000...</td>\n",
       "      <td>World</td>\n",
       "      <td>Ali Hassan al-Majid - widely known as Chemical...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496833</th>\n",
       "      <td>New York Times sports</td>\n",
       "      <td>http://www.boston.com/sports/basketball/articl...</td>\n",
       "      <td>Nets get Carter from Raptors</td>\n",
       "      <td>http___cache.boston.com_bonzai-fba_File-Based_...</td>\n",
       "      <td>Sports</td>\n",
       "      <td>INDIANAPOLIS -- All-Star Vince Carter was trad...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496834</th>\n",
       "      <td>BBC</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>496835 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                          source  \\\n",
       "0                 Yahoo Business   \n",
       "1                 Yahoo Business   \n",
       "2                 Yahoo Business   \n",
       "3                 Yahoo Business   \n",
       "4                 Yahoo Business   \n",
       "5                 Yahoo Business   \n",
       "6                 Yahoo Business   \n",
       "7                 Yahoo Business   \n",
       "8                 Yahoo Business   \n",
       "9                 Yahoo Business   \n",
       "10              Reuters Business   \n",
       "11              Reuters Business   \n",
       "12              Reuters Business   \n",
       "13              Reuters Business   \n",
       "14              Reuters Business   \n",
       "15              Reuters Business   \n",
       "16              Reuters Business   \n",
       "17              Reuters Business   \n",
       "18              Reuters Business   \n",
       "19              Reuters Business   \n",
       "20      Washington Post Business   \n",
       "21      Washington Post Business   \n",
       "22      Washington Post Business   \n",
       "23             BBC News Business   \n",
       "24             BBC News Business   \n",
       "25             BBC News Business   \n",
       "26             BBC News Business   \n",
       "27             BBC News Business   \n",
       "28             BBC News Business   \n",
       "29             BBC News Business   \n",
       "...                          ...   \n",
       "496805             Reuters World   \n",
       "496806          BBC - Front Page   \n",
       "496807             Reuters World   \n",
       "496808             Reuters World   \n",
       "496809             Reuters World   \n",
       "496810     New York Times sports   \n",
       "496811     New York Times sports   \n",
       "496812          BBC - Front Page   \n",
       "496813     New York Times sports   \n",
       "496814              CNN Top News   \n",
       "496815                       NaN   \n",
       "496816              CNN Top News   \n",
       "496817                       NaN   \n",
       "496818              CNN Top News   \n",
       "496819                       NaN   \n",
       "496820     New York Times sports   \n",
       "496821              CNN Top News   \n",
       "496822                       NaN   \n",
       "496823              CNN Top News   \n",
       "496824              CNN Top News   \n",
       "496825                       NaN   \n",
       "496826     New York Times sports   \n",
       "496827     New York Times sports   \n",
       "496828     New York Times sports   \n",
       "496829     New York Times sports   \n",
       "496830            BBC News world   \n",
       "496831     New York Times sports   \n",
       "496832            BBC News world   \n",
       "496833     New York Times sports   \n",
       "496834                      BBC    \n",
       "\n",
       "                                                      url  \\\n",
       "0       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "1       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "2       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "3       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "4       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "5       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "6       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "7       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "8       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "9       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "10      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "11      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "12      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "13      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "14      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "15      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "16      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "17      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "18      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "19      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "20      http://www.washingtonpost.com/wp-dyn/articles/...   \n",
       "21      http://www.washingtonpost.com/wp-dyn/articles/...   \n",
       "22      http://www.washingtonpost.com/wp-dyn/articles/...   \n",
       "23      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "24      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "25      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "26      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "27      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "28      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "29      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "...                                                   ...   \n",
       "496805  http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "496806  http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "496807  http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "496808  http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "496809  http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "496810  http://www.boston.com/sports/baseball/redsox/a...   \n",
       "496811  http://www.boston.com/sports/baseball/redsox/a...   \n",
       "496812  http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "496813  http://www.boston.com/sports/baseball/articles...   \n",
       "496814  http://www.newsisfree.com/iclick/i,65217888,23...   \n",
       "496815                                                  5   \n",
       "496816  http://www.newsisfree.com/iclick/i,65217889,23...   \n",
       "496817                                                  5   \n",
       "496818  http://www.newsisfree.com/iclick/i,65217890,23...   \n",
       "496819                                                  5   \n",
       "496820  http://www.boston.com/sports/football/articles...   \n",
       "496821  http://www.newsisfree.com/iclick/i,65217891,23...   \n",
       "496822                                                  5   \n",
       "496823  http://www.edapebaf.com/clickthrough.cool?db=c...   \n",
       "496824  http://www.newsisfree.com/iclick/i,65212552,23...   \n",
       "496825                                                  5   \n",
       "496826  http://www.boston.com/sports/football/articles...   \n",
       "496827  http://www.boston.com/sports/basketball/articl...   \n",
       "496828  http://www.boston.com/sports/basketball/articl...   \n",
       "496829  http://www.boston.com/sports/basketball/celtic...   \n",
       "496830  http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "496831  http://www.boston.com/sports/basketball/celtic...   \n",
       "496832  http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "496833  http://www.boston.com/sports/basketball/articl...   \n",
       "496834                                                NaN   \n",
       "\n",
       "                                                    title  \\\n",
       "0       Wall St. Pullback Reflects Tech Blowout (Reuters)   \n",
       "1       Wall St. Bears Claw Back Into the Black (Reuters)   \n",
       "2       Carlyle Looks Toward Commercial Aerospace (Reu...   \n",
       "3         Oil and Economy Cloud Stocks' Outlook (Reuters)   \n",
       "4       Iraq Halts Oil Exports from Main Southern Pipe...   \n",
       "5       Oil prices soar to all-time record, posing new...   \n",
       "6             Stocks End Up, But Near Year Lows (Reuters)   \n",
       "7                    Money Funds Fell in Latest Week (AP)   \n",
       "8       Fed minutes show dissent over inflation (USATO...   \n",
       "9                                 Safety Net (Forbes.com)   \n",
       "10                 Uprising Keeps Iraq Oil Exports Halved   \n",
       "11                Wall St. Pullback Reflects Tech Blowout   \n",
       "12                Wall St. Bears Claw Back Into the Black   \n",
       "13              Carlyle Looks Toward Commercial Aerospace   \n",
       "14                  Oil and Economy Cloud Stocks' Outlook   \n",
       "15                GlaxoSmithKline: CFO to Retire in March   \n",
       "16                 No Need for OPEC to Pump More-Iran Gov   \n",
       "17              Non-OPEC Nations Should Up Output-Purnomo   \n",
       "18                  Google IPO Auction Off to Rocky Start   \n",
       "19               Dollar Falls Broadly on Record Trade Gap   \n",
       "20                                  Rescuing an Old Saver   \n",
       "21                           Kids Rule for Back-to-School   \n",
       "22              In a Down Market, Head Toward Value Funds   \n",
       "23                        US trade deficit swells in June   \n",
       "24                      Crude oil prices jump to new high   \n",
       "25                      Shell 'could be target for Total'   \n",
       "26                       Google IPO faces Playboy slip-up   \n",
       "27                         Eurozone economy keeps growing   \n",
       "28                       Swatch dismisses tax allegations   \n",
       "29                               Expansion slows in Japan   \n",
       "...                                                   ...   \n",
       "496805              Israel Kills Two Gaza Gunmen - Medics   \n",
       "496806                   Iraqi judges quiz 'Chemical Ali'   \n",
       "496807  Pakistan's Musharraf Says Won't Quit as Army C...   \n",
       "496808      Philippines Begins to Rebuild Flood-Hit Areas   \n",
       "496809  Scant Progress on Post-Kyoto as Climate Talks End   \n",
       "496810                             Answer coming at first   \n",
       "496811                  Renteria signing a top-shelf deal   \n",
       "496812                    Potter shortlisted for FX Oscar   \n",
       "496813                   Report: Johnson deal is complete   \n",
       "496814                   Baby found alive; woman arrested   \n",
       "496815                                0000-00-00 00:00:00   \n",
       "496816                 Little agreed as climate talks end   \n",
       "496817                                0000-00-00 00:00:00   \n",
       "496818        Clouds, dry weather prevail in most of U.S.   \n",
       "496819                                0000-00-00 00:00:00   \n",
       "496820                    Saban not going to Dolphins yet   \n",
       "496821              Little agreed to as climate talks end   \n",
       "496822                                0000-00-00 00:00:00   \n",
       "496823      ADV: \\$150,000 Mortgage for Under \\$690/Month   \n",
       "496824                 Ecstatic return home for Turkey PM   \n",
       "496825                                2004-12-18 15:55:14   \n",
       "496826                                  Today's NFL games   \n",
       "496827                               Shake-up in the East   \n",
       "496828  Knicks need to go extra mile to beat Philadelphia   \n",
       "496829            High on priority list: Home improvement   \n",
       "496830                   Compromise seals climate meeting   \n",
       "496831                  Eisley enjoying his point of view   \n",
       "496832                   Iraqi judges quiz 'Chemical Ali'   \n",
       "496833                       Nets get Carter from Raptors   \n",
       "496834                                                NaN   \n",
       "\n",
       "                                                    image  category  \\\n",
       "0                                                    none  Business   \n",
       "1                                                    none  Business   \n",
       "2                                                    none  Business   \n",
       "3                                                    none  Business   \n",
       "4                                                    none  Business   \n",
       "5                                                    none  Business   \n",
       "6                                                    none  Business   \n",
       "7                                                    none  Business   \n",
       "8                                                    none  Business   \n",
       "9                                                    none  Business   \n",
       "10                                                   none  Business   \n",
       "11                                                   none  Business   \n",
       "12                                                   none  Business   \n",
       "13                                                   none  Business   \n",
       "14                                                   none  Business   \n",
       "15                                                   none  Business   \n",
       "16                                                   none  Business   \n",
       "17                                                   none  Business   \n",
       "18                                                   none  Business   \n",
       "19                                                   none  Business   \n",
       "20                                                   none  Business   \n",
       "21                                                   none  Business   \n",
       "22                                                   none  Business   \n",
       "23                                                   none  Business   \n",
       "24                                                   none  Business   \n",
       "25                                                   none  Business   \n",
       "26                                                   none  Business   \n",
       "27                                                   none  Business   \n",
       "28                                                   none  Business   \n",
       "29                                                   none  Business   \n",
       "...                                                   ...       ...   \n",
       "496805  http___ad.doubleclick.net_ad_reuters.com.dart_...     World   \n",
       "496806  http___newsimg.bbc.co.uk_media_images_39639000...  Top News   \n",
       "496807  http___ad.doubleclick.net_ad_reuters.com.dart_...     World   \n",
       "496808  http___wwwi.reuters.com_images_w148_amdf798539...     World   \n",
       "496809  http___ad.doubleclick.net_ad_reuters.com.dart_...     World   \n",
       "496810  http___cache.boston.com_bonzai-fba_File-Based_...    Sports   \n",
       "496811  http___cache.boston.com_bonzai-fba_File-Based_...    Sports   \n",
       "496812  http___newsimg.bbc.co.uk_media_images_40194000...  Top News   \n",
       "496813  http___cache.boston.com_bonzai-fba_File-Based_...    Sports   \n",
       "496814                                               none  Top News   \n",
       "496815                                                NaN       NaN   \n",
       "496816                                               none  Top News   \n",
       "496817                                                NaN       NaN   \n",
       "496818                                               none  Top News   \n",
       "496819                                                NaN       NaN   \n",
       "496820  http___cache.boston.com_bonzai-fba_File-Based_...    Sports   \n",
       "496821                                               none  Top News   \n",
       "496822                                                NaN       NaN   \n",
       "496823                                               none  Top News   \n",
       "496824                                               none  Top News   \n",
       "496825                                                NaN       NaN   \n",
       "496826  http___cache.boston.com_bonzai-fba_File-Based_...    Sports   \n",
       "496827  http___cache.boston.com_bonzai-fba_File-Based_...    Sports   \n",
       "496828  http___cache.boston.com_bonzai-fba_File-Based_...    Sports   \n",
       "496829  http___cache.boston.com_bonzai-fba_File-Based_...    Sports   \n",
       "496830  http___newsimg.bbc.co.uk_media_images_40643000...     World   \n",
       "496831  http___cache.boston.com_bonzai-fba_File-Based_...    Sports   \n",
       "496832  http___newsimg.bbc.co.uk_media_images_39639000...     World   \n",
       "496833  http___cache.boston.com_bonzai-fba_File-Based_...    Sports   \n",
       "496834                                                NaN       NaN   \n",
       "\n",
       "                                              description rank  \\\n",
       "0       Reuters - Wall Street's long-playing drama,\\\"W...    5   \n",
       "1       Reuters - Short-sellers, Wall Street's dwindli...    5   \n",
       "2       Reuters - Private investment firm Carlyle Grou...    5   \n",
       "3       Reuters - Soaring crude prices plus worries\\ab...    5   \n",
       "4       Reuters - Authorities have halted oil export\\f...    5   \n",
       "5       AFP - Tearaway world oil prices, toppling reco...    5   \n",
       "6       Reuters - Stocks ended slightly higher on Frid...    5   \n",
       "7       AP - Assets of the nation's retail money marke...    5   \n",
       "8       USATODAY.com - Retail sales bounced back a bit...    5   \n",
       "9       Forbes.com - After earning a PH.D. in Sociolog...    5   \n",
       "10       BAGHDAD (Reuters) - Iraq's oil exports were s...    5   \n",
       "11       NEW YORK (Reuters) - Wall Street's long-playi...    5   \n",
       "12       NEW YORK (Reuters) - Short-sellers, Wall Stre...    5   \n",
       "13       NEW YORK (Reuters) - Private investment firm ...    5   \n",
       "14       NEW YORK (Reuters) - Soaring crude prices plu...    5   \n",
       "15       LONDON (Reuters) - GlaxoSmithKline <GSK.L> Ch...    5   \n",
       "16       TEHRAN (Reuters) - OPEC can do nothing to dou...    5   \n",
       "17       JAKARTA (Reuters) - Non-OPEC oil exporters sh...    5   \n",
       "18       WASHINGTON/NEW YORK (Reuters) - The auction f...    5   \n",
       "19       NEW YORK (Reuters) - The dollar tumbled broad...    5   \n",
       "20      If you think you may need to help your elderly...    5   \n",
       "21      The purchasing power of kids is a big part of ...    5   \n",
       "22      There is little cause for celebration in the s...    5   \n",
       "23      The US trade deficit has exploded 19 to a reco...    5   \n",
       "24      The price of crude oil reaches new record leve...    5   \n",
       "25      Oil giant Shell could be bracing itself for a ...    5   \n",
       "26      The bidding gets underway for Google's public ...    5   \n",
       "27      Official figures show the 12-nation eurozone e...    5   \n",
       "28      Swiss watchmaker Swatch, official timekeeper t...    5   \n",
       "29      Economic growth in Japan slows down as the cou...    5   \n",
       "...                                                   ...  ...   \n",
       "496805   GAZA (Reuters) - Israeli soldiers shot dead t...    5   \n",
       "496806  Ali Hassan al-Majid - widely known as Chemical...    5   \n",
       "496807   KARACHI (Reuters) - Pakistani President Perve...    5   \n",
       "496808   MANILA (Reuters) - Hundreds of army and civil...    5   \n",
       "496809   BUENOS AIRES, Argentina (Reuters) - U.N. talk...    5   \n",
       "496810  Red Sox general manager Theo Epstein yesterday...    5   \n",
       "496811  Red Sox general manager Theo Epstein acknowled...    5   \n",
       "496812  The third Harry Potter film and Spider-Man 2 a...    5   \n",
       "496813  The Big Unit is coming to the Yankees. After d...    5   \n",
       "496814                                                NaN  NaN   \n",
       "496815                                                NaN  NaN   \n",
       "496816                                                NaN  NaN   \n",
       "496817                                                NaN  NaN   \n",
       "496818                                                NaN  NaN   \n",
       "496819                                                NaN  NaN   \n",
       "496820  The Miami Dolphins will put their courtship of...    5   \n",
       "496821                                                NaN  NaN   \n",
       "496822                                                NaN  NaN   \n",
       "496823  Mortgage rates are at record lows. Save \\$1000...    5   \n",
       "496824                                                NaN  NaN   \n",
       "496825                                                NaN  NaN   \n",
       "496826  PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...    5   \n",
       "496827  Yesterday will go down as Liberation Day in th...    5   \n",
       "496828  Allan Houston hit the tying 3-pointer with 0.4...    5   \n",
       "496829  Doc Rivers knows any postseason plans hinge on...    5   \n",
       "496830  A climate conference overcomes last-minute obj...    5   \n",
       "496831  Howard Eisley has fond memories of Boston. He ...    5   \n",
       "496832  Ali Hassan al-Majid - widely known as Chemical...    5   \n",
       "496833  INDIANAPOLIS -- All-Star Vince Carter was trad...    5   \n",
       "496834                                                NaN  NaN   \n",
       "\n",
       "                    pubdate  video  \n",
       "0       0000-00-00 00:00:00    NaN  \n",
       "1       0000-00-00 00:00:00    NaN  \n",
       "2       0000-00-00 00:00:00    NaN  \n",
       "3       0000-00-00 00:00:00    NaN  \n",
       "4       0000-00-00 00:00:00    NaN  \n",
       "5       0000-00-00 00:00:00    NaN  \n",
       "6       0000-00-00 00:00:00    NaN  \n",
       "7       0000-00-00 00:00:00    NaN  \n",
       "8       0000-00-00 00:00:00    NaN  \n",
       "9       0000-00-00 00:00:00    NaN  \n",
       "10      0000-00-00 00:00:00    NaN  \n",
       "11      2004-08-18 22:53:24    NaN  \n",
       "12      0000-00-00 00:00:00    NaN  \n",
       "13      2004-08-18 22:53:27    NaN  \n",
       "14      0000-00-00 00:00:00    NaN  \n",
       "15      2004-08-18 22:53:29    NaN  \n",
       "16      2004-08-18 22:53:32    NaN  \n",
       "17      2004-08-18 22:53:34    NaN  \n",
       "18      0000-00-00 00:00:00    NaN  \n",
       "19      2004-08-18 22:53:37    NaN  \n",
       "20      2004-08-18 22:53:40    NaN  \n",
       "21      2004-08-18 22:53:43    NaN  \n",
       "22      2004-08-18 22:53:46    NaN  \n",
       "23      0000-00-00 00:00:00    NaN  \n",
       "24      0000-00-00 00:00:00    NaN  \n",
       "25      0000-00-00 00:00:00    NaN  \n",
       "26      0000-00-00 00:00:00    NaN  \n",
       "27      0000-00-00 00:00:00    NaN  \n",
       "28      0000-00-00 00:00:00    NaN  \n",
       "29      0000-00-00 00:00:00    NaN  \n",
       "...                     ...    ...  \n",
       "496805  0000-00-00 00:00:00    NaN  \n",
       "496806  0000-00-00 00:00:00    NaN  \n",
       "496807  2004-12-18 15:59:39    NaN  \n",
       "496808  0000-00-00 00:00:00    NaN  \n",
       "496809  0000-00-00 00:00:00    NaN  \n",
       "496810  2004-12-18 16:19:49    NaN  \n",
       "496811  2004-12-18 16:15:38    NaN  \n",
       "496812  2004-12-18 15:28:30    NaN  \n",
       "496813  0000-00-00 00:00:00    NaN  \n",
       "496814                  NaN    NaN  \n",
       "496815                  NaN    NaN  \n",
       "496816                  NaN    NaN  \n",
       "496817                  NaN    NaN  \n",
       "496818                  NaN    NaN  \n",
       "496819                  NaN    NaN  \n",
       "496820  0000-00-00 00:00:00    NaN  \n",
       "496821                  NaN    NaN  \n",
       "496822                  NaN    NaN  \n",
       "496823  2004-12-18 16:17:10    NaN  \n",
       "496824                  NaN    NaN  \n",
       "496825                  NaN    NaN  \n",
       "496826  0000-00-00 00:00:00    NaN  \n",
       "496827  2004-12-18 15:55:57    NaN  \n",
       "496828  0000-00-00 00:00:00    NaN  \n",
       "496829  2004-12-18 16:14:36    NaN  \n",
       "496830  0000-00-00 00:00:00    NaN  \n",
       "496831  0000-00-00 00:00:00    NaN  \n",
       "496832  0000-00-00 00:00:00    NaN  \n",
       "496833  0000-00-00 00:00:00    NaN  \n",
       "496834                  NaN    NaN  \n",
       "\n",
       "[496835 rows x 9 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newsspace = pd.read_csv(\"newsSpace.parsed\", header=None, names=dfcols)\n",
    "newsspace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combine headline and description into a single pipe-separated text column.\n",
    "# NaN propagates through `+`, so a row missing either part gets a NaN feature.\n",
    "newsspace[\"feature\"] = (\n",
    "    newsspace[\"title\"] + \"|\" + newsspace[\"description\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0            World\n",
       "1    Entertainment\n",
       "2           Sports\n",
       "3         Business\n",
       "Name: category, dtype: object"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rank categories by their number of rows with a non-null `source`, largest first.\n",
    "classes = (\n",
    "    newsspace.groupby([\"category\"])[[\"source\"]]\n",
    "    .count()\n",
    "    .sort_values(\"source\", ascending=False)\n",
    "    .reset_index()\n",
    ")\n",
    "# Keep only the names of the four most frequent categories.\n",
    "top4_classes = classes[\"category\"].head(4)\n",
    "top4_classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source</th>\n",
       "      <th>url</th>\n",
       "      <th>title</th>\n",
       "      <th>image</th>\n",
       "      <th>category</th>\n",
       "      <th>description</th>\n",
       "      <th>rank</th>\n",
       "      <th>pubdate</th>\n",
       "      <th>video</th>\n",
       "      <th>feature</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Wall St. Pullback Reflects Tech Blowout (Reuters)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Wall Street's long-playing drama,\\\"W...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Wall St. Pullback Reflects Tech Blowout (Reute...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Wall St. Bears Claw Back Into the Black (Reuters)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Short-sellers, Wall Street's dwindli...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Wall St. Bears Claw Back Into the Black (Reute...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Carlyle Looks Toward Commercial Aerospace (Reu...</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Private investment firm Carlyle Grou...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Carlyle Looks Toward Commercial Aerospace (Reu...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Oil and Economy Cloud Stocks' Outlook (Reuters)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Soaring crude prices plus worries\\ab...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Oil and Economy Cloud Stocks' Outlook (Reuters...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Iraq Halts Oil Exports from Main Southern Pipe...</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Authorities have halted oil export\\f...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Iraq Halts Oil Exports from Main Southern Pipe...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Oil prices soar to all-time record, posing new...</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>AFP - Tearaway world oil prices, toppling reco...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Oil prices soar to all-time record, posing new...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Stocks End Up, But Near Year Lows (Reuters)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Stocks ended slightly higher on Frid...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Stocks End Up, But Near Year Lows (Reuters)|Re...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Money Funds Fell in Latest Week (AP)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>AP - Assets of the nation's retail money marke...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Money Funds Fell in Latest Week (AP)|AP - Asse...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Fed minutes show dissent over inflation (USATO...</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>USATODAY.com - Retail sales bounced back a bit...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Fed minutes show dissent over inflation (USATO...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Safety Net (Forbes.com)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Forbes.com - After earning a PH.D. in Sociolog...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Safety Net (Forbes.com)|Forbes.com - After ear...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Uprising Keeps Iraq Oil Exports Halved</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>BAGHDAD (Reuters) - Iraq's oil exports were s...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Uprising Keeps Iraq Oil Exports Halved| BAGHDA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Wall St. Pullback Reflects Tech Blowout</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>NEW YORK (Reuters) - Wall Street's long-playi...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:24</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Wall St. Pullback Reflects Tech Blowout| NEW Y...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Wall St. Bears Claw Back Into the Black</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>NEW YORK (Reuters) - Short-sellers, Wall Stre...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Wall St. Bears Claw Back Into the Black| NEW Y...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Carlyle Looks Toward Commercial Aerospace</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>NEW YORK (Reuters) - Private investment firm ...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:27</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Carlyle Looks Toward Commercial Aerospace| NEW...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Oil and Economy Cloud Stocks' Outlook</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>NEW YORK (Reuters) - Soaring crude prices plu...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Oil and Economy Cloud Stocks' Outlook| NEW YOR...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>GlaxoSmithKline: CFO to Retire in March</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>LONDON (Reuters) - GlaxoSmithKline &lt;GSK.L&gt; Ch...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>GlaxoSmithKline: CFO to Retire in March| LONDO...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>No Need for OPEC to Pump More-Iran Gov</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>TEHRAN (Reuters) - OPEC can do nothing to dou...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:32</td>\n",
       "      <td>NaN</td>\n",
       "      <td>No Need for OPEC to Pump More-Iran Gov| TEHRAN...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Non-OPEC Nations Should Up Output-Purnomo</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>JAKARTA (Reuters) - Non-OPEC oil exporters sh...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:34</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Non-OPEC Nations Should Up Output-Purnomo| JAK...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Google IPO Auction Off to Rocky Start</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>WASHINGTON/NEW YORK (Reuters) - The auction f...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Google IPO Auction Off to Rocky Start| WASHING...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Dollar Falls Broadly on Record Trade Gap</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>NEW YORK (Reuters) - The dollar tumbled broad...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:37</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Dollar Falls Broadly on Record Trade Gap| NEW ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>Washington Post Business</td>\n",
       "      <td>http://www.washingtonpost.com/wp-dyn/articles/...</td>\n",
       "      <td>Rescuing an Old Saver</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>If you think you may need to help your elderly...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:40</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Rescuing an Old Saver|If you think you may nee...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>Washington Post Business</td>\n",
       "      <td>http://www.washingtonpost.com/wp-dyn/articles/...</td>\n",
       "      <td>Kids Rule for Back-to-School</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>The purchasing power of kids is a big part of ...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:43</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Kids Rule for Back-to-School|The purchasing po...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>Washington Post Business</td>\n",
       "      <td>http://www.washingtonpost.com/wp-dyn/articles/...</td>\n",
       "      <td>In a Down Market, Head Toward Value Funds</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>There is little cause for celebration in the s...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-08-18 22:53:46</td>\n",
       "      <td>NaN</td>\n",
       "      <td>In a Down Market, Head Toward Value Funds|Ther...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>US trade deficit swells in June</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>The US trade deficit has exploded 19 to a reco...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>US trade deficit swells in June|The US trade d...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Crude oil prices jump to new high</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>The price of crude oil reaches new record leve...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Crude oil prices jump to new high|The price of...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Shell 'could be target for Total'</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Oil giant Shell could be bracing itself for a ...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Shell 'could be target for Total'|Oil giant Sh...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Google IPO faces Playboy slip-up</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>The bidding gets underway for Google's public ...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Google IPO faces Playboy slip-up|The bidding g...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Eurozone economy keeps growing</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Official figures show the 12-nation eurozone e...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Eurozone economy keeps growing|Official figure...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Swatch dismisses tax allegations</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Swiss watchmaker Swatch, official timekeeper t...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Swatch dismisses tax allegations|Swiss watchma...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Expansion slows in Japan</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Economic growth in Japan slows down as the cou...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Expansion slows in Japan|Economic growth in Ja...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295229</th>\n",
       "      <td>USA Today</td>\n",
       "      <td>http://www.usatoday.com/money/industries/techn...</td>\n",
       "      <td>Google stock shoots, scores, up 120 from IPO</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>For anyone who misses the late  #39;90s, there...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Google stock shoots, scores, up 120 from IPO|F...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295230</th>\n",
       "      <td>Atlanta Business Chronicle</td>\n",
       "      <td>http://www.11alive.com/news/news_article.aspx?...</td>\n",
       "      <td>Delta likely to land in 1 of 3 courts</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>With the likelihood of bankruptcy growing stro...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-10-26 06:46:22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Delta likely to land in 1 of 3 courts|With the...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295247</th>\n",
       "      <td>Reuters Business</td>\n",
       "      <td>http://www.reuters.com/newsArticle.jhtml?type=...</td>\n",
       "      <td>Murdoch Wins Vote to Shift News Corp.</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>ADELAIDE (Reuters) - Rupert Murdoch won share...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Murdoch Wins Vote to Shift News Corp.| ADELAID...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295248</th>\n",
       "      <td>Pittsburgh Post Gazette</td>\n",
       "      <td>http://servihoo.com/channels/kinews/v3news_det...</td>\n",
       "      <td>Huge deal swallows International Steel Group</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>The  quot;bigger is better quot; mind-set domi...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Huge deal swallows International Steel Group|T...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295249</th>\n",
       "      <td>Today (Singapore)</td>\n",
       "      <td>http://www.todayonline.com/articles/28309.asp</td>\n",
       "      <td>Election uncertainty hits US stocks</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>NEW YORK - US stocks closed slightly lower yes...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Election uncertainty hits US stocks|NEW YORK -...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295250</th>\n",
       "      <td>Indianapolis Star</td>\n",
       "      <td>http://www.indystar.com/articles/5/189172-3665...</td>\n",
       "      <td>ATA may be close to filing bankruptcy</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>ATA Airlines Inc. named its top financial exec...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ATA may be close to filing bankruptcy|ATA Airl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295251</th>\n",
       "      <td>Business Report</td>\n",
       "      <td>http://www.busrep.co.za/index.php?fSectionId= ...</td>\n",
       "      <td>Harmony cagey about its war chest for Gold Fields</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>Harmony, the listed gold miner, has not put a ...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-10-26 07:01:29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Harmony cagey about its war chest for Gold Fie...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295252</th>\n",
       "      <td>Information Week</td>\n",
       "      <td>http://www.thehindubusinessline.com/ew/2004/10...</td>\n",
       "      <td>Search Vs. Security</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>Google Inc. #39;s Desktop Search software does...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Search Vs. Security|Google Inc. #39;s Desktop ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295253</th>\n",
       "      <td>Pittsburgh Post Gazette</td>\n",
       "      <td>http://servihoo.com/channels/kinews/v3news_det...</td>\n",
       "      <td>Profit-taking depresses oil prices</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>Crude oil futures slipped yesterday as Norwegi...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-10-26 07:07:52</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Profit-taking depresses oil prices|Crude oil f...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295286</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Oil Prices Dip Toward  #36;54 (Reuters)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Oil prices slipped further below  #3...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-10-26 07:10:39</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Oil Prices Dip Toward  #36;54 (Reuters)|Reuter...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295287</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Murdoch Wins Vote to Shift News Corp. (Reuters)</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Reuters - Rupert Murdoch won shareholder\\appro...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Murdoch Wins Vote to Shift News Corp. (Reuters...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295291</th>\n",
       "      <td>Business Report</td>\n",
       "      <td>http://servihoo.com/channels/kinews/v3news_det...</td>\n",
       "      <td>Ispat in \\$4.5bn deal to form biggest steel firm</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>London - Indian-born billionaire Lakshmi Mitta...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Ispat in \\$4.5bn deal to form biggest steel fi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295292</th>\n",
       "      <td>MSN Money</td>\n",
       "      <td>http://moneycentral.msn.com/inc/news/breakingr...</td>\n",
       "      <td>American Express Profit Rises 14 Percent</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>American Express Co. said on Monday its quarte...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>American Express Profit Rises 14 Percent|Ameri...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295293</th>\n",
       "      <td>Atlanta Journal Constitution (su</td>\n",
       "      <td>http://www.cio-today.com/story.xhtml?story_tit...</td>\n",
       "      <td>Cingular deal clears hurdle</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>Cingular Wireless moved a step closer to its p...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cingular deal clears hurdle|Cingular Wireless ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295294</th>\n",
       "      <td>CNN International</td>\n",
       "      <td>http://edition.cnn.com/2004/BUSINESS/10/26/aus...</td>\n",
       "      <td>Woolies wins \\$1 billion pub battle</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>Australia #39;s biggest supermarket group, Woo...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Woolies wins \\$1 billion pub battle|Australia ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295295</th>\n",
       "      <td>Detroit Free Press</td>\n",
       "      <td>http://business.newsfactor.com/story.xhtml?sto...</td>\n",
       "      <td>Kellogg raises earnings forecast for year afte...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>Kellogg Co. says it expects low single-digit g...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Kellogg raises earnings forecast for year afte...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295343</th>\n",
       "      <td>New York Times Business</td>\n",
       "      <td>http://www.nytimes.com/2004/10/26/business/26s...</td>\n",
       "      <td>Retail Stores Feel the Pinch of Cargo Caught i...</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Retailers gearing up for their annual sales pu...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Retail Stores Feel the Pinch of Cargo Caught i...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295344</th>\n",
       "      <td>Ananova  Business</td>\n",
       "      <td>http://www.newsisfree.com/iclick/i,58327675,16...</td>\n",
       "      <td>Last Operating Selby Coal Pit Closes</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295346</th>\n",
       "      <td>Ananova  Business</td>\n",
       "      <td>http://www.newsisfree.com/iclick/i,58324292,16...</td>\n",
       "      <td>Banks Face Grilling Over Credit Cards</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295348</th>\n",
       "      <td>Ananova  Business</td>\n",
       "      <td>http://www.edapebaf.com/clickthrough.cool?db=c...</td>\n",
       "      <td>ADV: Mortgage Rates Drop to 6-Month Low</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>\\$150,000 mortgage for only \\$610/month. Lower...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-10-26 07:17:12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ADV: Mortgage Rates Drop to 6-Month Low|\\$150,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295355</th>\n",
       "      <td>East Valley Tribune</td>\n",
       "      <td>http://business.newsfactor.com/story.xhtml?sto...</td>\n",
       "      <td>Sanctions suspended, not lifted</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>Tucked into that \\$136 billion in corporate ta...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Sanctions suspended, not lifted|Tucked into th...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295356</th>\n",
       "      <td>Business Day, South Africa</td>\n",
       "      <td>http://www.falkland-malvinas.com/Detalle.asp?N...</td>\n",
       "      <td>#39;Fuels taking growing share of goods trade...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>GENEVA - Data released by the World Trade Orga...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>#39;Fuels taking growing share of goods trade...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295357</th>\n",
       "      <td>New York Times</td>\n",
       "      <td>http://www.11alive.com/news/news_article.aspx?...</td>\n",
       "      <td>Delta Negotiates Two Deals, But Not One With I...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>Delta Air Lines announced two financing deals ...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-10-26 07:25:18</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Delta Negotiates Two Deals, But Not One With I...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295358</th>\n",
       "      <td>Yahoo News</td>\n",
       "      <td>http://edition.cnn.com/2004/BUSINESS/10/26/aus...</td>\n",
       "      <td>UPDATE: Coles Blinks First In Australian Pubs ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>SYDNEY (Dow Jones)--Coles Myer Ltd. (CML.AU) b...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>UPDATE: Coles Blinks First In Australian Pubs ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295359</th>\n",
       "      <td>New York Times</td>\n",
       "      <td>http://www.cio-today.com/story.xhtml?story_tit...</td>\n",
       "      <td>EDS Delays Its Results After a Dispute</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>The Electronic Data Systems Corporation announ...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>EDS Delays Its Results After a Dispute|The Ele...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295360</th>\n",
       "      <td>New York Times</td>\n",
       "      <td>http://business.newsfactor.com/story.xhtml?sto...</td>\n",
       "      <td>Kellogg Cites New Diet Foods for Profit Rise</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>The Kellogg Company said yesterday that third-...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Kellogg Cites New Diet Foods for Profit Rise|T...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295393</th>\n",
       "      <td>BBC News Business</td>\n",
       "      <td>http://news.bbc.co.uk/go/click/rss/0.91/public...</td>\n",
       "      <td>Cuba bans US dollar transactions</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>Cuba announces a ban on commercial transaction...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cuba bans US dollar transactions|Cuba announce...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295397</th>\n",
       "      <td>Yahoo Business</td>\n",
       "      <td>http://us.rd.yahoo.com/dailynews/rss/business/...</td>\n",
       "      <td>Disasters come at a bad time for Japan economy...</td>\n",
       "      <td>none</td>\n",
       "      <td>Business</td>\n",
       "      <td>AFP - The recent series of typhoons and tremor...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Disasters come at a bad time for Japan economy...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295400</th>\n",
       "      <td>CBS MarketWatch</td>\n",
       "      <td>http://www.ctv.ca/servlet/ArticleNews/story/CT...</td>\n",
       "      <td>Cherkasky to take over as CEO</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>SAN FRANCISCO (CBS.MW) -- Marsh  amp; McLennan...</td>\n",
       "      <td>5</td>\n",
       "      <td>2004-10-26 07:34:34</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Cherkasky to take over as CEO|SAN FRANCISCO (C...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>295401</th>\n",
       "      <td>The Scotsman</td>\n",
       "      <td>http://servihoo.com/channels/kinews/v3news_det...</td>\n",
       "      <td>Norway Takes the Sting Out of Oil Prices</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Business</td>\n",
       "      <td>Crude oil futures inched lower today, after No...</td>\n",
       "      <td>5</td>\n",
       "      <td>0000-00-00 00:00:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Norway Takes the Sting Out of Oil Prices|Crude...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>127600 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                  source  \\\n",
       "0                         Yahoo Business   \n",
       "1                         Yahoo Business   \n",
       "2                         Yahoo Business   \n",
       "3                         Yahoo Business   \n",
       "4                         Yahoo Business   \n",
       "5                         Yahoo Business   \n",
       "6                         Yahoo Business   \n",
       "7                         Yahoo Business   \n",
       "8                         Yahoo Business   \n",
       "9                         Yahoo Business   \n",
       "10                      Reuters Business   \n",
       "11                      Reuters Business   \n",
       "12                      Reuters Business   \n",
       "13                      Reuters Business   \n",
       "14                      Reuters Business   \n",
       "15                      Reuters Business   \n",
       "16                      Reuters Business   \n",
       "17                      Reuters Business   \n",
       "18                      Reuters Business   \n",
       "19                      Reuters Business   \n",
       "20              Washington Post Business   \n",
       "21              Washington Post Business   \n",
       "22              Washington Post Business   \n",
       "23                     BBC News Business   \n",
       "24                     BBC News Business   \n",
       "25                     BBC News Business   \n",
       "26                     BBC News Business   \n",
       "27                     BBC News Business   \n",
       "28                     BBC News Business   \n",
       "29                     BBC News Business   \n",
       "...                                  ...   \n",
       "295229                         USA Today   \n",
       "295230        Atlanta Business Chronicle   \n",
       "295247                  Reuters Business   \n",
       "295248           Pittsburgh Post Gazette   \n",
       "295249                 Today (Singapore)   \n",
       "295250                 Indianapolis Star   \n",
       "295251                   Business Report   \n",
       "295252                  Information Week   \n",
       "295253           Pittsburgh Post Gazette   \n",
       "295286                    Yahoo Business   \n",
       "295287                    Yahoo Business   \n",
       "295291                   Business Report   \n",
       "295292                         MSN Money   \n",
       "295293  Atlanta Journal Constitution (su   \n",
       "295294                 CNN International   \n",
       "295295                Detroit Free Press   \n",
       "295343           New York Times Business   \n",
       "295344                 Ananova  Business   \n",
       "295346                 Ananova  Business   \n",
       "295348                 Ananova  Business   \n",
       "295355               East Valley Tribune   \n",
       "295356        Business Day, South Africa   \n",
       "295357                    New York Times   \n",
       "295358                        Yahoo News   \n",
       "295359                    New York Times   \n",
       "295360                    New York Times   \n",
       "295393                 BBC News Business   \n",
       "295397                    Yahoo Business   \n",
       "295400                   CBS MarketWatch   \n",
       "295401                      The Scotsman   \n",
       "\n",
       "                                                      url  \\\n",
       "0       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "1       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "2       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "3       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "4       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "5       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "6       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "7       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "8       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "9       http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "10      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "11      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "12      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "13      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "14      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "15      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "16      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "17      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "18      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "19      http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "20      http://www.washingtonpost.com/wp-dyn/articles/...   \n",
       "21      http://www.washingtonpost.com/wp-dyn/articles/...   \n",
       "22      http://www.washingtonpost.com/wp-dyn/articles/...   \n",
       "23      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "24      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "25      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "26      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "27      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "28      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "29      http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "...                                                   ...   \n",
       "295229  http://www.usatoday.com/money/industries/techn...   \n",
       "295230  http://www.11alive.com/news/news_article.aspx?...   \n",
       "295247  http://www.reuters.com/newsArticle.jhtml?type=...   \n",
       "295248  http://servihoo.com/channels/kinews/v3news_det...   \n",
       "295249      http://www.todayonline.com/articles/28309.asp   \n",
       "295250  http://www.indystar.com/articles/5/189172-3665...   \n",
       "295251  http://www.busrep.co.za/index.php?fSectionId= ...   \n",
       "295252  http://www.thehindubusinessline.com/ew/2004/10...   \n",
       "295253  http://servihoo.com/channels/kinews/v3news_det...   \n",
       "295286  http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "295287  http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "295291  http://servihoo.com/channels/kinews/v3news_det...   \n",
       "295292  http://moneycentral.msn.com/inc/news/breakingr...   \n",
       "295293  http://www.cio-today.com/story.xhtml?story_tit...   \n",
       "295294  http://edition.cnn.com/2004/BUSINESS/10/26/aus...   \n",
       "295295  http://business.newsfactor.com/story.xhtml?sto...   \n",
       "295343  http://www.nytimes.com/2004/10/26/business/26s...   \n",
       "295344  http://www.newsisfree.com/iclick/i,58327675,16...   \n",
       "295346  http://www.newsisfree.com/iclick/i,58324292,16...   \n",
       "295348  http://www.edapebaf.com/clickthrough.cool?db=c...   \n",
       "295355  http://business.newsfactor.com/story.xhtml?sto...   \n",
       "295356  http://www.falkland-malvinas.com/Detalle.asp?N...   \n",
       "295357  http://www.11alive.com/news/news_article.aspx?...   \n",
       "295358  http://edition.cnn.com/2004/BUSINESS/10/26/aus...   \n",
       "295359  http://www.cio-today.com/story.xhtml?story_tit...   \n",
       "295360  http://business.newsfactor.com/story.xhtml?sto...   \n",
       "295393  http://news.bbc.co.uk/go/click/rss/0.91/public...   \n",
       "295397  http://us.rd.yahoo.com/dailynews/rss/business/...   \n",
       "295400  http://www.ctv.ca/servlet/ArticleNews/story/CT...   \n",
       "295401  http://servihoo.com/channels/kinews/v3news_det...   \n",
       "\n",
       "                                                    title image  category  \\\n",
       "0       Wall St. Pullback Reflects Tech Blowout (Reuters)  none  Business   \n",
       "1       Wall St. Bears Claw Back Into the Black (Reuters)  none  Business   \n",
       "2       Carlyle Looks Toward Commercial Aerospace (Reu...  none  Business   \n",
       "3         Oil and Economy Cloud Stocks' Outlook (Reuters)  none  Business   \n",
       "4       Iraq Halts Oil Exports from Main Southern Pipe...  none  Business   \n",
       "5       Oil prices soar to all-time record, posing new...  none  Business   \n",
       "6             Stocks End Up, But Near Year Lows (Reuters)  none  Business   \n",
       "7                    Money Funds Fell in Latest Week (AP)  none  Business   \n",
       "8       Fed minutes show dissent over inflation (USATO...  none  Business   \n",
       "9                                 Safety Net (Forbes.com)  none  Business   \n",
       "10                 Uprising Keeps Iraq Oil Exports Halved  none  Business   \n",
       "11                Wall St. Pullback Reflects Tech Blowout  none  Business   \n",
       "12                Wall St. Bears Claw Back Into the Black  none  Business   \n",
       "13              Carlyle Looks Toward Commercial Aerospace  none  Business   \n",
       "14                  Oil and Economy Cloud Stocks' Outlook  none  Business   \n",
       "15                GlaxoSmithKline: CFO to Retire in March  none  Business   \n",
       "16                 No Need for OPEC to Pump More-Iran Gov  none  Business   \n",
       "17              Non-OPEC Nations Should Up Output-Purnomo  none  Business   \n",
       "18                  Google IPO Auction Off to Rocky Start  none  Business   \n",
       "19               Dollar Falls Broadly on Record Trade Gap  none  Business   \n",
       "20                                  Rescuing an Old Saver  none  Business   \n",
       "21                           Kids Rule for Back-to-School  none  Business   \n",
       "22              In a Down Market, Head Toward Value Funds  none  Business   \n",
       "23                        US trade deficit swells in June  none  Business   \n",
       "24                      Crude oil prices jump to new high  none  Business   \n",
       "25                      Shell 'could be target for Total'  none  Business   \n",
       "26                       Google IPO faces Playboy slip-up  none  Business   \n",
       "27                         Eurozone economy keeps growing  none  Business   \n",
       "28                       Swatch dismisses tax allegations  none  Business   \n",
       "29                               Expansion slows in Japan  none  Business   \n",
       "...                                                   ...   ...       ...   \n",
       "295229       Google stock shoots, scores, up 120 from IPO   NaN  Business   \n",
       "295230              Delta likely to land in 1 of 3 courts   NaN  Business   \n",
       "295247              Murdoch Wins Vote to Shift News Corp.  none  Business   \n",
       "295248       Huge deal swallows International Steel Group   NaN  Business   \n",
       "295249                Election uncertainty hits US stocks   NaN  Business   \n",
       "295250              ATA may be close to filing bankruptcy   NaN  Business   \n",
       "295251  Harmony cagey about its war chest for Gold Fields   NaN  Business   \n",
       "295252                                Search Vs. Security   NaN  Business   \n",
       "295253                 Profit-taking depresses oil prices   NaN  Business   \n",
       "295286            Oil Prices Dip Toward  #36;54 (Reuters)  none  Business   \n",
       "295287    Murdoch Wins Vote to Shift News Corp. (Reuters)  none  Business   \n",
       "295291   Ispat in \\$4.5bn deal to form biggest steel firm   NaN  Business   \n",
       "295292           American Express Profit Rises 14 Percent   NaN  Business   \n",
       "295293                        Cingular deal clears hurdle   NaN  Business   \n",
       "295294                Woolies wins \\$1 billion pub battle   NaN  Business   \n",
       "295295  Kellogg raises earnings forecast for year afte...   NaN  Business   \n",
       "295343  Retail Stores Feel the Pinch of Cargo Caught i...  none  Business   \n",
       "295344               Last Operating Selby Coal Pit Closes  none  Business   \n",
       "295346              Banks Face Grilling Over Credit Cards  none  Business   \n",
       "295348            ADV: Mortgage Rates Drop to 6-Month Low  none  Business   \n",
       "295355                    Sanctions suspended, not lifted   NaN  Business   \n",
       "295356   #39;Fuels taking growing share of goods trade...   NaN  Business   \n",
       "295357  Delta Negotiates Two Deals, But Not One With I...   NaN  Business   \n",
       "295358  UPDATE: Coles Blinks First In Australian Pubs ...   NaN  Business   \n",
       "295359             EDS Delays Its Results After a Dispute   NaN  Business   \n",
       "295360       Kellogg Cites New Diet Foods for Profit Rise   NaN  Business   \n",
       "295393                   Cuba bans US dollar transactions  none  Business   \n",
       "295397  Disasters come at a bad time for Japan economy...  none  Business   \n",
       "295400                      Cherkasky to take over as CEO   NaN  Business   \n",
       "295401           Norway Takes the Sting Out of Oil Prices   NaN  Business   \n",
       "\n",
       "                                              description rank  \\\n",
       "0       Reuters - Wall Street's long-playing drama,\\\"W...    5   \n",
       "1       Reuters - Short-sellers, Wall Street's dwindli...    5   \n",
       "2       Reuters - Private investment firm Carlyle Grou...    5   \n",
       "3       Reuters - Soaring crude prices plus worries\\ab...    5   \n",
       "4       Reuters - Authorities have halted oil export\\f...    5   \n",
       "5       AFP - Tearaway world oil prices, toppling reco...    5   \n",
       "6       Reuters - Stocks ended slightly higher on Frid...    5   \n",
       "7       AP - Assets of the nation's retail money marke...    5   \n",
       "8       USATODAY.com - Retail sales bounced back a bit...    5   \n",
       "9       Forbes.com - After earning a PH.D. in Sociolog...    5   \n",
       "10       BAGHDAD (Reuters) - Iraq's oil exports were s...    5   \n",
       "11       NEW YORK (Reuters) - Wall Street's long-playi...    5   \n",
       "12       NEW YORK (Reuters) - Short-sellers, Wall Stre...    5   \n",
       "13       NEW YORK (Reuters) - Private investment firm ...    5   \n",
       "14       NEW YORK (Reuters) - Soaring crude prices plu...    5   \n",
       "15       LONDON (Reuters) - GlaxoSmithKline <GSK.L> Ch...    5   \n",
       "16       TEHRAN (Reuters) - OPEC can do nothing to dou...    5   \n",
       "17       JAKARTA (Reuters) - Non-OPEC oil exporters sh...    5   \n",
       "18       WASHINGTON/NEW YORK (Reuters) - The auction f...    5   \n",
       "19       NEW YORK (Reuters) - The dollar tumbled broad...    5   \n",
       "20      If you think you may need to help your elderly...    5   \n",
       "21      The purchasing power of kids is a big part of ...    5   \n",
       "22      There is little cause for celebration in the s...    5   \n",
       "23      The US trade deficit has exploded 19 to a reco...    5   \n",
       "24      The price of crude oil reaches new record leve...    5   \n",
       "25      Oil giant Shell could be bracing itself for a ...    5   \n",
       "26      The bidding gets underway for Google's public ...    5   \n",
       "27      Official figures show the 12-nation eurozone e...    5   \n",
       "28      Swiss watchmaker Swatch, official timekeeper t...    5   \n",
       "29      Economic growth in Japan slows down as the cou...    5   \n",
       "...                                                   ...  ...   \n",
       "295229  For anyone who misses the late  #39;90s, there...    5   \n",
       "295230  With the likelihood of bankruptcy growing stro...    5   \n",
       "295247   ADELAIDE (Reuters) - Rupert Murdoch won share...    5   \n",
       "295248  The  quot;bigger is better quot; mind-set domi...    5   \n",
       "295249  NEW YORK - US stocks closed slightly lower yes...    5   \n",
       "295250  ATA Airlines Inc. named its top financial exec...    5   \n",
       "295251  Harmony, the listed gold miner, has not put a ...    5   \n",
       "295252  Google Inc. #39;s Desktop Search software does...    5   \n",
       "295253  Crude oil futures slipped yesterday as Norwegi...    5   \n",
       "295286  Reuters - Oil prices slipped further below  #3...    5   \n",
       "295287  Reuters - Rupert Murdoch won shareholder\\appro...    5   \n",
       "295291  London - Indian-born billionaire Lakshmi Mitta...    5   \n",
       "295292  American Express Co. said on Monday its quarte...    5   \n",
       "295293  Cingular Wireless moved a step closer to its p...    5   \n",
       "295294  Australia #39;s biggest supermarket group, Woo...    5   \n",
       "295295  Kellogg Co. says it expects low single-digit g...    5   \n",
       "295343  Retailers gearing up for their annual sales pu...    5   \n",
       "295344                                                NaN  NaN   \n",
       "295346                                                NaN  NaN   \n",
       "295348  \\$150,000 mortgage for only \\$610/month. Lower...    5   \n",
       "295355  Tucked into that \\$136 billion in corporate ta...    5   \n",
       "295356  GENEVA - Data released by the World Trade Orga...    5   \n",
       "295357  Delta Air Lines announced two financing deals ...    5   \n",
       "295358  SYDNEY (Dow Jones)--Coles Myer Ltd. (CML.AU) b...    5   \n",
       "295359  The Electronic Data Systems Corporation announ...    5   \n",
       "295360  The Kellogg Company said yesterday that third-...    5   \n",
       "295393  Cuba announces a ban on commercial transaction...    5   \n",
       "295397  AFP - The recent series of typhoons and tremor...    5   \n",
       "295400  SAN FRANCISCO (CBS.MW) -- Marsh  amp; McLennan...    5   \n",
       "295401  Crude oil futures inched lower today, after No...    5   \n",
       "\n",
       "                    pubdate  video  \\\n",
       "0       0000-00-00 00:00:00    NaN   \n",
       "1       0000-00-00 00:00:00    NaN   \n",
       "2       0000-00-00 00:00:00    NaN   \n",
       "3       0000-00-00 00:00:00    NaN   \n",
       "4       0000-00-00 00:00:00    NaN   \n",
       "5       0000-00-00 00:00:00    NaN   \n",
       "6       0000-00-00 00:00:00    NaN   \n",
       "7       0000-00-00 00:00:00    NaN   \n",
       "8       0000-00-00 00:00:00    NaN   \n",
       "9       0000-00-00 00:00:00    NaN   \n",
       "10      0000-00-00 00:00:00    NaN   \n",
       "11      2004-08-18 22:53:24    NaN   \n",
       "12      0000-00-00 00:00:00    NaN   \n",
       "13      2004-08-18 22:53:27    NaN   \n",
       "14      0000-00-00 00:00:00    NaN   \n",
       "15      2004-08-18 22:53:29    NaN   \n",
       "16      2004-08-18 22:53:32    NaN   \n",
       "17      2004-08-18 22:53:34    NaN   \n",
       "18      0000-00-00 00:00:00    NaN   \n",
       "19      2004-08-18 22:53:37    NaN   \n",
       "20      2004-08-18 22:53:40    NaN   \n",
       "21      2004-08-18 22:53:43    NaN   \n",
       "22      2004-08-18 22:53:46    NaN   \n",
       "23      0000-00-00 00:00:00    NaN   \n",
       "24      0000-00-00 00:00:00    NaN   \n",
       "25      0000-00-00 00:00:00    NaN   \n",
       "26      0000-00-00 00:00:00    NaN   \n",
       "27      0000-00-00 00:00:00    NaN   \n",
       "28      0000-00-00 00:00:00    NaN   \n",
       "29      0000-00-00 00:00:00    NaN   \n",
       "...                     ...    ...   \n",
       "295229  0000-00-00 00:00:00    NaN   \n",
       "295230  2004-10-26 06:46:22    NaN   \n",
       "295247  0000-00-00 00:00:00    NaN   \n",
       "295248  0000-00-00 00:00:00    NaN   \n",
       "295249  0000-00-00 00:00:00    NaN   \n",
       "295250  0000-00-00 00:00:00    NaN   \n",
       "295251  2004-10-26 07:01:29    NaN   \n",
       "295252  0000-00-00 00:00:00    NaN   \n",
       "295253  2004-10-26 07:07:52    NaN   \n",
       "295286  2004-10-26 07:10:39    NaN   \n",
       "295287  0000-00-00 00:00:00    NaN   \n",
       "295291  0000-00-00 00:00:00    NaN   \n",
       "295292  0000-00-00 00:00:00    NaN   \n",
       "295293  0000-00-00 00:00:00    NaN   \n",
       "295294  0000-00-00 00:00:00    NaN   \n",
       "295295  0000-00-00 00:00:00    NaN   \n",
       "295343  0000-00-00 00:00:00    NaN   \n",
       "295344                  NaN    NaN   \n",
       "295346                  NaN    NaN   \n",
       "295348  2004-10-26 07:17:12    NaN   \n",
       "295355  0000-00-00 00:00:00    NaN   \n",
       "295356  0000-00-00 00:00:00    NaN   \n",
       "295357  2004-10-26 07:25:18    NaN   \n",
       "295358  0000-00-00 00:00:00    NaN   \n",
       "295359  0000-00-00 00:00:00    NaN   \n",
       "295360  0000-00-00 00:00:00    NaN   \n",
       "295393  0000-00-00 00:00:00    NaN   \n",
       "295397  0000-00-00 00:00:00    NaN   \n",
       "295400  2004-10-26 07:34:34    NaN   \n",
       "295401  0000-00-00 00:00:00    NaN   \n",
       "\n",
       "                                                  feature  \n",
       "0       Wall St. Pullback Reflects Tech Blowout (Reute...  \n",
       "1       Wall St. Bears Claw Back Into the Black (Reute...  \n",
       "2       Carlyle Looks Toward Commercial Aerospace (Reu...  \n",
       "3       Oil and Economy Cloud Stocks' Outlook (Reuters...  \n",
       "4       Iraq Halts Oil Exports from Main Southern Pipe...  \n",
       "5       Oil prices soar to all-time record, posing new...  \n",
       "6       Stocks End Up, But Near Year Lows (Reuters)|Re...  \n",
       "7       Money Funds Fell in Latest Week (AP)|AP - Asse...  \n",
       "8       Fed minutes show dissent over inflation (USATO...  \n",
       "9       Safety Net (Forbes.com)|Forbes.com - After ear...  \n",
       "10      Uprising Keeps Iraq Oil Exports Halved| BAGHDA...  \n",
       "11      Wall St. Pullback Reflects Tech Blowout| NEW Y...  \n",
       "12      Wall St. Bears Claw Back Into the Black| NEW Y...  \n",
       "13      Carlyle Looks Toward Commercial Aerospace| NEW...  \n",
       "14      Oil and Economy Cloud Stocks' Outlook| NEW YOR...  \n",
       "15      GlaxoSmithKline: CFO to Retire in March| LONDO...  \n",
       "16      No Need for OPEC to Pump More-Iran Gov| TEHRAN...  \n",
       "17      Non-OPEC Nations Should Up Output-Purnomo| JAK...  \n",
       "18      Google IPO Auction Off to Rocky Start| WASHING...  \n",
       "19      Dollar Falls Broadly on Record Trade Gap| NEW ...  \n",
       "20      Rescuing an Old Saver|If you think you may nee...  \n",
       "21      Kids Rule for Back-to-School|The purchasing po...  \n",
       "22      In a Down Market, Head Toward Value Funds|Ther...  \n",
       "23      US trade deficit swells in June|The US trade d...  \n",
       "24      Crude oil prices jump to new high|The price of...  \n",
       "25      Shell 'could be target for Total'|Oil giant Sh...  \n",
       "26      Google IPO faces Playboy slip-up|The bidding g...  \n",
       "27      Eurozone economy keeps growing|Official figure...  \n",
       "28      Swatch dismisses tax allegations|Swiss watchma...  \n",
       "29      Expansion slows in Japan|Economic growth in Ja...  \n",
       "...                                                   ...  \n",
       "295229  Google stock shoots, scores, up 120 from IPO|F...  \n",
       "295230  Delta likely to land in 1 of 3 courts|With the...  \n",
       "295247  Murdoch Wins Vote to Shift News Corp.| ADELAID...  \n",
       "295248  Huge deal swallows International Steel Group|T...  \n",
       "295249  Election uncertainty hits US stocks|NEW YORK -...  \n",
       "295250  ATA may be close to filing bankruptcy|ATA Airl...  \n",
       "295251  Harmony cagey about its war chest for Gold Fie...  \n",
       "295252  Search Vs. Security|Google Inc. #39;s Desktop ...  \n",
       "295253  Profit-taking depresses oil prices|Crude oil f...  \n",
       "295286  Oil Prices Dip Toward  #36;54 (Reuters)|Reuter...  \n",
       "295287  Murdoch Wins Vote to Shift News Corp. (Reuters...  \n",
       "295291  Ispat in \\$4.5bn deal to form biggest steel fi...  \n",
       "295292  American Express Profit Rises 14 Percent|Ameri...  \n",
       "295293  Cingular deal clears hurdle|Cingular Wireless ...  \n",
       "295294  Woolies wins \\$1 billion pub battle|Australia ...  \n",
       "295295  Kellogg raises earnings forecast for year afte...  \n",
       "295343  Retail Stores Feel the Pinch of Cargo Caught i...  \n",
       "295344                                                NaN  \n",
       "295346                                                NaN  \n",
       "295348  ADV: Mortgage Rates Drop to 6-Month Low|\\$150,...  \n",
       "295355  Sanctions suspended, not lifted|Tucked into th...  \n",
       "295356   #39;Fuels taking growing share of goods trade...  \n",
       "295357  Delta Negotiates Two Deals, But Not One With I...  \n",
       "295358  UPDATE: Coles Blinks First In Australian Pubs ...  \n",
       "295359  EDS Delays Its Results After a Dispute|The Ele...  \n",
       "295360  Kellogg Cites New Diet Foods for Profit Rise|T...  \n",
       "295393  Cuba bans US dollar transactions|Cuba announce...  \n",
       "295397  Disasters come at a bad time for Japan economy...  \n",
       "295400  Cherkasky to take over as CEO|SAN FRANCISCO (C...  \n",
       "295401  Norway Takes the Sting Out of Oil Prices|Crude...  \n",
       "\n",
       "[127600 rows x 10 columns]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the pool that is later split into train and test sets: the first\n",
    "# 31,900 rows of every category listed in `top4_classes` (defined in an\n",
    "# earlier cell).\n",
    "# Rows whose `feature` text is missing are removed *before* taking the\n",
    "# per-category head, so NaN examples cannot end up in the exported files and\n",
    "# each category still contributes up to 31,900 usable rows.\n",
    "train_test = (\n",
    "    newsspace[newsspace.category.isin(top4_classes)]\n",
    "    .dropna(subset=[\"feature\"])\n",
    "    .groupby(\"category\")\n",
    "    .head(31900)\n",
    ")\n",
    "train_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# `train_test` keeps up to 31,900 rows per category, i.e. 30,000 training\n",
    "# plus 1,900 test examples for each class. `train_size` / `test_size` are\n",
    "# absolute totals over all classes, so the per-class targets are scaled by\n",
    "# the number of categories; passing the bare per-class numbers would use\n",
    "# only a fraction of the selected rows.\n",
    "# NOTE(review): assumes the 30,000 / 1,900 figures were meant per class - confirm.\n",
    "TRAIN_PER_CLASS = 30000\n",
    "TEST_PER_CLASS = 1900\n",
    "n_classes = train_test.category.nunique()\n",
    "\n",
    "# Stratify on the label so both splits keep the same class proportions as\n",
    "# `train_test`; the fixed seed keeps the split reproducible.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    train_test.feature,\n",
    "    train_test.category,\n",
    "    train_size=TRAIN_PER_CLASS * n_classes,\n",
    "    test_size=TEST_PER_CLASS * n_classes,\n",
    "    stratify=train_test.category,\n",
    "    random_state=42,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair the training texts with their labels as two columns and save them\n",
    "# as a tab-separated file (the row index is written as well, since\n",
    "# `to_csv` keeps it by default).\n",
    "pd.concat(\n",
    "    [X_train, y_train], axis=1, keys=[\"feature\", \"label\"]\n",
    ").to_csv(\"train.tsv\", sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair the held-out texts with their labels as two columns and save them\n",
    "# as a tab-separated file (the row index is written as well, since\n",
    "# `to_csv` keeps it by default).\n",
    "pd.concat(\n",
    "    [X_test, y_test], axis=1, keys=[\"feature\", \"label\"]\n",
    ").to_csv(\"test.tsv\", sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
